Import modules¶

In [17]:
import shap
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import plot_roc_curve
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn import set_config
set_config(display="diagram")
warnings.filterwarnings('ignore')
# Use the full option key: the bare 'max_colwidth' spelling relies on pandas'
# prefix matching of option names and is less robust across pandas versions.
pd.set_option('display.max_colwidth', 1000)

Read dataset¶

In [2]:
# Load the Android-permission dataset: 87 columns total — 86 binary permission
# flags plus the 'Result' label (see the info() output below).
data = pd.read_csv("dataset.csv")
data.head(3)
Out[2]:
android.permission.GET_ACCOUNTS com.sonyericsson.home.permission.BROADCAST_BADGE android.permission.READ_PROFILE android.permission.MANAGE_ACCOUNTS android.permission.WRITE_SYNC_SETTINGS android.permission.READ_EXTERNAL_STORAGE android.permission.RECEIVE_SMS com.android.launcher.permission.READ_SETTINGS android.permission.WRITE_SETTINGS com.google.android.providers.gsf.permission.READ_GSERVICES ... com.android.launcher.permission.UNINSTALL_SHORTCUT com.sec.android.iap.permission.BILLING com.htc.launcher.permission.UPDATE_SHORTCUT com.sec.android.provider.badge.permission.WRITE android.permission.ACCESS_NETWORK_STATE com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE com.huawei.android.launcher.permission.READ_SETTINGS android.permission.READ_SMS android.permission.PROCESS_INCOMING_CALLS Result
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0

3 rows × 87 columns

In [3]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29332 entries, 0 to 29331
Data columns (total 87 columns):
 #   Column                                                                         Non-Null Count  Dtype
---  ------                                                                         --------------  -----
 0   android.permission.GET_ACCOUNTS                                                29332 non-null  int64
 1   com.sonyericsson.home.permission.BROADCAST_BADGE                               29332 non-null  int64
 2   android.permission.READ_PROFILE                                                29332 non-null  int64
 3   android.permission.MANAGE_ACCOUNTS                                             29332 non-null  int64
 4   android.permission.WRITE_SYNC_SETTINGS                                         29332 non-null  int64
 5   android.permission.READ_EXTERNAL_STORAGE                                       29332 non-null  int64
 6   android.permission.RECEIVE_SMS                                                 29332 non-null  int64
 7   com.android.launcher.permission.READ_SETTINGS                                  29332 non-null  int64
 8   android.permission.WRITE_SETTINGS                                              29332 non-null  int64
 9   com.google.android.providers.gsf.permission.READ_GSERVICES                     29332 non-null  int64
 10  android.permission.DOWNLOAD_WITHOUT_NOTIFICATION                               29332 non-null  int64
 11  android.permission.GET_TASKS                                                   29332 non-null  int64
 12  android.permission.WRITE_EXTERNAL_STORAGE                                      29332 non-null  int64
 13  android.permission.RECORD_AUDIO                                                29332 non-null  int64
 14  com.huawei.android.launcher.permission.CHANGE_BADGE                            29332 non-null  int64
 15  com.oppo.launcher.permission.READ_SETTINGS                                     29332 non-null  int64
 16  android.permission.CHANGE_NETWORK_STATE                                        29332 non-null  int64
 17  com.android.launcher.permission.INSTALL_SHORTCUT                               29332 non-null  int64
 18  android.permission.android.permission.READ_PHONE_STATE                         29332 non-null  int64
 19  android.permission.CALL_PHONE                                                  29332 non-null  int64
 20  android.permission.WRITE_CONTACTS                                              29332 non-null  int64
 21  android.permission.READ_PHONE_STATE                                            29332 non-null  int64
 22  com.samsung.android.providers.context.permission.WRITE_USE_APP_FEATURE_SURVEY  29332 non-null  int64
 23  android.permission.MODIFY_AUDIO_SETTINGS                                       29332 non-null  int64
 24  android.permission.ACCESS_LOCATION_EXTRA_COMMANDS                              29332 non-null  int64
 25  android.permission.INTERNET                                                    29332 non-null  int64
 26  android.permission.MOUNT_UNMOUNT_FILESYSTEMS                                   29332 non-null  int64
 27  com.majeur.launcher.permission.UPDATE_BADGE                                    29332 non-null  int64
 28  android.permission.AUTHENTICATE_ACCOUNTS                                       29332 non-null  int64
 29  com.htc.launcher.permission.READ_SETTINGS                                      29332 non-null  int64
 30  android.permission.ACCESS_WIFI_STATE                                           29332 non-null  int64
 31  android.permission.FLASHLIGHT                                                  29332 non-null  int64
 32  android.permission.READ_APP_BADGE                                              29332 non-null  int64
 33  android.permission.USE_CREDENTIALS                                             29332 non-null  int64
 34  android.permission.CHANGE_CONFIGURATION                                        29332 non-null  int64
 35  android.permission.READ_SYNC_SETTINGS                                          29332 non-null  int64
 36  android.permission.BROADCAST_STICKY                                            29332 non-null  int64
 37  com.anddoes.launcher.permission.UPDATE_COUNT                                   29332 non-null  int64
 38  com.android.alarm.permission.SET_ALARM                                         29332 non-null  int64
 39  com.google.android.c2dm.permission.RECEIVE                                     29332 non-null  int64
 40  android.permission.KILL_BACKGROUND_PROCESSES                                   29332 non-null  int64
 41  com.sonymobile.home.permission.PROVIDER_INSERT_BADGE                           29332 non-null  int64
 42  com.sec.android.provider.badge.permission.READ                                 29332 non-null  int64
 43  android.permission.WRITE_CALENDAR                                              29332 non-null  int64
 44  android.permission.SEND_SMS                                                    29332 non-null  int64
 45  com.huawei.android.launcher.permission.WRITE_SETTINGS                          29332 non-null  int64
 46  android.permission.REQUEST_INSTALL_PACKAGES                                    29332 non-null  int64
 47  android.permission.SET_WALLPAPER_HINTS                                         29332 non-null  int64
 48  android.permission.SET_WALLPAPER                                               29332 non-null  int64
 49  com.oppo.launcher.permission.WRITE_SETTINGS                                    29332 non-null  int64
 50  android.permission.RESTART_PACKAGES                                            29332 non-null  int64
 51  me.everything.badger.permission.BADGE_COUNT_WRITE                              29332 non-null  int64
 52  android.permission.ACCESS_MOCK_LOCATION                                        29332 non-null  int64
 53  android.permission.ACCESS_COARSE_LOCATION                                      29332 non-null  int64
 54  android.permission.READ_LOGS                                                   29332 non-null  int64
 55  com.google.android.gms.permission.ACTIVITY_RECOGNITION                         29332 non-null  int64
 56  com.amazon.device.messaging.permission.RECEIVE                                 29332 non-null  int64
 57  android.permission.SYSTEM_ALERT_WINDOW                                         29332 non-null  int64
 58  android.permission.DISABLE_KEYGUARD                                            29332 non-null  int64
 59  android.permission.USE_FINGERPRINT                                             29332 non-null  int64
 60  me.everything.badger.permission.BADGE_COUNT_READ                               29332 non-null  int64
 61  android.permission.CHANGE_WIFI_STATE                                           29332 non-null  int64
 62  android.permission.READ_CONTACTS                                               29332 non-null  int64
 63  com.android.vending.BILLING                                                    29332 non-null  int64
 64  android.permission.READ_CALENDAR                                               29332 non-null  int64
 65  android.permission.RECEIVE_BOOT_COMPLETED                                      29332 non-null  int64
 66  android.permission.WAKE_LOCK                                                   29332 non-null  int64
 67  android.permission.ACCESS_FINE_LOCATION                                        29332 non-null  int64
 68  android.permission.BLUETOOTH                                                   29332 non-null  int64
 69  android.permission.CAMERA                                                      29332 non-null  int64
 70  com.android.vending.CHECK_LICENSE                                              29332 non-null  int64
 71  android.permission.FOREGROUND_SERVICE                                          29332 non-null  int64
 72  android.permission.BLUETOOTH_ADMIN                                             29332 non-null  int64
 73  android.permission.VIBRATE                                                     29332 non-null  int64
 74  android.permission.NFC                                                         29332 non-null  int64
 75  android.permission.RECEIVE_USER_PRESENT                                        29332 non-null  int64
 76  android.permission.CLEAR_APP_CACHE                                             29332 non-null  int64
 77  com.android.launcher.permission.UNINSTALL_SHORTCUT                             29332 non-null  int64
 78  com.sec.android.iap.permission.BILLING                                         29332 non-null  int64
 79  com.htc.launcher.permission.UPDATE_SHORTCUT                                    29332 non-null  int64
 80  com.sec.android.provider.badge.permission.WRITE                                29332 non-null  int64
 81  android.permission.ACCESS_NETWORK_STATE                                        29332 non-null  int64
 82  com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE         29332 non-null  int64
 83  com.huawei.android.launcher.permission.READ_SETTINGS                           29332 non-null  int64
 84  android.permission.READ_SMS                                                    29332 non-null  int64
 85  android.permission.PROCESS_INCOMING_CALLS                                      29332 non-null  int64
 86  Result                                                                         29332 non-null  int64
dtypes: int64(87)
memory usage: 19.5 MB

All of the columns are binary — each contains only the values 0 and 1.

In [4]:
# Count distinct values per column to confirm that every feature is binary.
unique_counts = {col: len(data[col].unique()) for col in data.columns}
pd.DataFrame(unique_counts, index=[0])
Out[4]:
android.permission.GET_ACCOUNTS com.sonyericsson.home.permission.BROADCAST_BADGE android.permission.READ_PROFILE android.permission.MANAGE_ACCOUNTS android.permission.WRITE_SYNC_SETTINGS android.permission.READ_EXTERNAL_STORAGE android.permission.RECEIVE_SMS com.android.launcher.permission.READ_SETTINGS android.permission.WRITE_SETTINGS com.google.android.providers.gsf.permission.READ_GSERVICES ... com.android.launcher.permission.UNINSTALL_SHORTCUT com.sec.android.iap.permission.BILLING com.htc.launcher.permission.UPDATE_SHORTCUT com.sec.android.provider.badge.permission.WRITE android.permission.ACCESS_NETWORK_STATE com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE com.huawei.android.launcher.permission.READ_SETTINGS android.permission.READ_SMS android.permission.PROCESS_INCOMING_CALLS Result
0 2 2 2 2 2 2 2 2 2 2 ... 2 2 2 2 2 2 2 2 2 2

1 rows × 87 columns

In [5]:
# Class balance: per-label counts plus a bar chart.
print(data['Result'].value_counts())
plt.figure(figsize=(7, 5), dpi=100)
# Pass the column as a keyword; positional data arguments were removed in seaborn 0.12+.
sns.countplot(x=data['Result']);
1    14700
0    14632
Name: Result, dtype: int64

The dataset is well balanced (14,700 vs 14,632 samples per class).

In [6]:
# Column-wise null counts (transposed to a single row) — confirms there is no missing data.
pd.DataFrame(data.isnull().sum()).T
Out[6]:
android.permission.GET_ACCOUNTS com.sonyericsson.home.permission.BROADCAST_BADGE android.permission.READ_PROFILE android.permission.MANAGE_ACCOUNTS android.permission.WRITE_SYNC_SETTINGS android.permission.READ_EXTERNAL_STORAGE android.permission.RECEIVE_SMS com.android.launcher.permission.READ_SETTINGS android.permission.WRITE_SETTINGS com.google.android.providers.gsf.permission.READ_GSERVICES ... com.android.launcher.permission.UNINSTALL_SHORTCUT com.sec.android.iap.permission.BILLING com.htc.launcher.permission.UPDATE_SHORTCUT com.sec.android.provider.badge.permission.WRITE android.permission.ACCESS_NETWORK_STATE com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE com.huawei.android.launcher.permission.READ_SETTINGS android.permission.READ_SMS android.permission.PROCESS_INCOMING_CALLS Result
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 87 columns

In [7]:
# NOTE(review): `random` is never used in the code visible here — confirm and remove.
from random import random


# Features are every column except the last; the last column ('Result') is the label.
X = data[data.columns[:-1]]
y = data['Result']

# Stratified 85/15 split keeps the class balance identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.15,
                                                    stratify=y,
                                                    random_state = 121)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[7]:
((24932, 86), (4400, 86), (24932,), (4400,))

Multi-collinearity before feature selection¶

In [8]:
# Before feature selection.
# Correlation heatmap of all 86 features; tick labels are hidden because
# there are too many columns to render readably.
plt.figure(figsize=(8, 5), dpi=100)
sns.heatmap(X.corr(), cmap='Purples', xticklabels=False, yticklabels=False);

Machine learning modelling with default parameters.¶

In [9]:
#logistic regression (no tuning - no feature selection - only defaults)
def logistic(xtrain, ytrain, xtest, ytest):
    """Fit a default LogisticRegression and print F1, ROC-AUC and a classification report."""
    print("Logistic Regression --->\n")
    model = LogisticRegression()
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)
    positive_scores = model.predict_proba(xtest)[:, 1]
    print(f"F1-Score : {f1_score(ytest, predictions):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, positive_scores):.3f}", end="\n\n")
    print(classification_report(ytest, predictions))

logistic(X_train, y_train, X_test, y_test)
Logistic Regression --->

F1-Score : 0.956
AUC - ROC Score : 0.985

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2195
           1       0.95      0.96      0.96      2205

    accuracy                           0.96      4400
   macro avg       0.96      0.96      0.96      4400
weighted avg       0.96      0.96      0.96      4400

In [10]:
# Support vector machines (no tuning - no feature selection - only defaults)

def support_vector(xtrain, ytrain, xtest, ytest):
    """Fit a default SVC (probability=True so ROC-AUC can be computed) and print metrics."""
    print("Support Vector Machines --->\n")
    model = SVC(probability=True)
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)
    positive_scores = model.predict_proba(xtest)[:, 1]
    print(f"F1-Score : {f1_score(ytest, predictions):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, positive_scores):.3f}", end="\n\n")
    print(classification_report(ytest, predictions))

support_vector(X_train, y_train, X_test, y_test)
Support Vector Machines --->

F1-Score : 0.961
AUC - ROC Score : 0.989

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2195
           1       0.96      0.97      0.96      2205

    accuracy                           0.96      4400
   macro avg       0.96      0.96      0.96      4400
weighted avg       0.96      0.96      0.96      4400

In [11]:
# Knn (no tuning - no feature selection - only defaults)

def knn(xtrain, ytrain, xtest, ytest):
    """Fit a 5-nearest-neighbour classifier and print F1, ROC-AUC and a classification report."""
    print("K Nearest Neighbors --->\n")
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)
    positive_scores = model.predict_proba(xtest)[:, 1]
    print(f"F1-Score : {f1_score(ytest, predictions):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, positive_scores):.3f}", end="\n\n")
    print(classification_report(ytest, predictions))

knn(X_train, y_train, X_test, y_test)
K Nearest Neighbors --->

F1-Score : 0.959
AUC - ROC Score : 0.981

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2195
           1       0.95      0.97      0.96      2205

    accuracy                           0.96      4400
   macro avg       0.96      0.96      0.96      4400
weighted avg       0.96      0.96      0.96      4400

In [12]:
# Random forests (no tuning - no feature selection - only defaults)

def random_forest(xtrain, ytrain, xtest, ytest):
    """Fit a 300-tree random forest (fixed seed) and print F1, ROC-AUC and a report."""
    print("Random Forests --->\n")
    model = RandomForestClassifier(n_estimators=300, random_state=0)
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)
    print(f"F1-Score : {f1_score(ytest, predictions):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, model.predict_proba(xtest)[:, 1]):.3f}", end="\n\n")
    print(classification_report(ytest, predictions))

random_forest(X_train, y_train, X_test, y_test)
Random Forests --->

F1-Score : 0.969
AUC - ROC Score : 0.992

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2195
           1       0.97      0.97      0.97      2205

    accuracy                           0.97      4400
   macro avg       0.97      0.97      0.97      4400
weighted avg       0.97      0.97      0.97      4400

In [13]:
# XGBoost (no tuning - no feature selection - only defaults)

def xgboost_clf(xtrain, ytrain, xtest, ytest):
    """Fit a default XGBClassifier (AUC eval metric, fixed seed) and print metrics."""
    print("XGBoost --->\n")
    model = XGBClassifier(eval_metric='auc', random_state=101)
    model.fit(xtrain, ytrain)
    predictions = model.predict(xtest)
    print(f"F1-Score : {f1_score(ytest, predictions):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, model.predict_proba(xtest)[:, 1]):.3f}", end="\n\n")
    print(classification_report(ytest, predictions))

xgboost_clf(X_train, y_train, X_test, y_test)
XGBoost --->

F1-Score : 0.965
AUC - ROC Score : 0.992

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2195
           1       0.97      0.96      0.97      2205

    accuracy                           0.96      4400
   macro avg       0.96      0.97      0.96      4400
weighted avg       0.97      0.96      0.97      4400

In [14]:
# CatBoost (no tuning - no feature selection - only defaults)

def catboost_clf(xtrain, ytrain, xtest, ytest):
    """Fit a default CatBoostClassifier and print F1, ROC-AUC and a classification report.

    Bug fix: the model previously trained on the notebook globals X_train/y_train
    instead of the xtrain/ytrain arguments, silently ignoring whatever data was
    passed in.
    """
    print("CatBoost --->\n")
    cat = CatBoostClassifier(loss_function='Logloss',
                             verbose=False)

    # Train on the function arguments, not the notebook globals.
    cat.fit(xtrain, ytrain, plot=True)
    test = Pool(xtest, ytest)
    cat_pred = cat.predict(test)
    print(f"F1-Score : {f1_score(ytest, cat_pred):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, cat.predict_proba(xtest)[:, 1]):.3f}", end="\n\n")
    print(classification_report(ytest, cat_pred))

catboost_clf(X_train, y_train, X_test, y_test)
CatBoost --->

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
F1-Score : 0.966
AUC - ROC Score : 0.993

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2195
           1       0.97      0.97      0.97      2205

    accuracy                           0.97      4400
   macro avg       0.97      0.97      0.97      4400
weighted avg       0.97      0.97      0.97      4400

Voting classifier --> Random forest + XGBoost + CatBoost¶
In [16]:
# Voting classifier - Random Forest, XGBoost and CatBoost

def vote_clf(xtrain, ytrain, xtest, ytest):
    """Soft-voting ensemble of RF, XGBoost and CatBoost; prints F1, ROC-AUC and a report."""
    print("Voting Classifier --->\n")
    base_models = [
        ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
        ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
        ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
    ]
    voter = VotingClassifier(estimators=base_models, voting="soft", n_jobs=-1)

    voter.fit(xtrain, ytrain)
    predictions = voter.predict(xtest)
    print(f"\nF1-Score : {f1_score(ytest, predictions):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, voter.predict_proba(xtest)[:, 1]):.3f}", end="\n\n")
    print(classification_report(ytest, predictions))

vote_clf(X_train, y_train, X_test, y_test)
Voting Classifier --->


F1-Score : 0.967
AUC - ROC Score : 0.993

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      2195
           1       0.97      0.96      0.97      2205

    accuracy                           0.97      4400
   macro avg       0.97      0.97      0.97      4400
weighted avg       0.97      0.97      0.97      4400

Stacking classifier --> Logistic regression + SVM + Knn + Random forest + XGBoost + CatBoost¶
In [17]:
# Stacking classifier - Logistic Regression, SVM, KNN, Random Forest, XGBoost and CatBoost.

def stacking_clf(xtrain, ytrain, xtest, ytest):
    """Stack six base learners under a random-forest meta-learner; print F1, ROC-AUC, report."""
    print("Stacking Classifier --->\n")
    base_models = [
        ("LR", LogisticRegression()),
        ("SVM", SVC(probability=True)),
        ("KNN", KNeighborsClassifier(n_neighbors=5)),
        ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
        ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
        ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
    ]
    stacker = StackingClassifier(
        estimators=base_models,
        final_estimator=RandomForestClassifier(n_estimators=300, random_state=0),
        cv=5,
        passthrough=False,
        n_jobs=-1,
    )

    stacker.fit(xtrain, ytrain)
    predictions = stacker.predict(xtest)
    print(f"F1-Score : {f1_score(ytest, predictions):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(ytest, stacker.predict_proba(xtest)[:, 1]):.3f}", end="\n\n")
    print(classification_report(ytest, predictions))

stacking_clf(X_train, y_train, X_test, y_test)
Stacking Classifier --->

F1-Score : 0.967
AUC - ROC Score : 0.992

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      2195
           1       0.97      0.96      0.97      2205

    accuracy                           0.97      4400
   macro avg       0.97      0.97      0.97      4400
weighted avg       0.97      0.97      0.97      4400

1. Feature selection - Variance Inflation Factor (remove multicollinearity)¶

Backward elimination of features with VIF > 5¶

In [59]:
# Handle multicollinearity by eliminating features based on Variance Inflation Factor (VIF).

def VIF(x):
    """Iteratively drop the feature with the highest VIF until every VIF is <= 5.

    Parameters
    ----------
    x : pd.DataFrame
        Feature matrix to screen for multicollinearity.

    Returns
    -------
    list
        Column names removed, in elimination order (each is also printed with its score).

    Bug fix: the loop previously dropped columns from the global ``X`` instead of
    the ``x`` argument, so passing any other frame had no effect.
    """
    cols_remove = []
    while True:
        # Drop everything eliminated so far, then recompute VIF on the remainder.
        temp_x = x.drop(cols_remove, axis=1)
        vf = pd.DataFrame()
        vf['columns'] = temp_x.columns
        vf['VIF score'] = [variance_inflation_factor(temp_x.values, i) for i in range(len(temp_x.columns))]
        vf.sort_values(by='VIF score', ascending=False, inplace=True)
        vf.reset_index(drop=True, inplace=True)
        if vf.loc[0, 'VIF score'] > 5:
            # Row 0 holds the worst offender; remove it and iterate again.
            print(vf.loc[0, 'columns'], vf.loc[0, 'VIF score'])
            cols_remove.append(vf.loc[0, 'columns'])
        else:
            break

    return cols_remove
Columns to be removed.¶
In [60]:
# Run backward elimination on the full feature matrix; keep the dropped column
# names so the same removal can be reapplied inside the pipelines below.
print("Columns to be removed (VIF > 5) --->")
cols_remove = VIF(X)
Columns to be removed (VIF > 5) --->
me.everything.badger.permission.BADGE_COUNT_WRITE inf
com.sec.android.provider.badge.permission.READ 544.4516399895751
com.oppo.launcher.permission.WRITE_SETTINGS 186.73742628488154
com.huawei.android.launcher.permission.WRITE_SETTINGS 94.1855996984245
com.htc.launcher.permission.UPDATE_SHORTCUT 90.02778090329161
com.anddoes.launcher.permission.UPDATE_COUNT 78.72204740606233
com.huawei.android.launcher.permission.CHANGE_BADGE 42.24730372097385
android.permission.ACCESS_NETWORK_STATE 39.67262119636497
android.permission.READ_APP_BADGE 31.921291142563234
com.majeur.launcher.permission.UPDATE_BADGE 27.942467281595935
com.huawei.android.launcher.permission.READ_SETTINGS 20.40800554077491
com.sonyericsson.home.permission.BROADCAST_BADGE 19.716629104817088
android.permission.ACCESS_COARSE_LOCATION 9.442415097881028
android.permission.READ_CALENDAR 8.319298090331598
com.sec.android.provider.badge.permission.WRITE 7.58175517934947
com.oppo.launcher.permission.READ_SETTINGS 7.230427332697641
android.permission.INTERNET 5.339439869414774
android.permission.RECEIVE_SMS 5.147512049622763
android.permission.READ_PHONE_STATE 5.015054263933838
Heatmap after removing multi-collinearity¶
In [76]:
# Correlation heatmap after dropping the high-VIF features for comparison
# with the pre-selection heatmap above.
plt.figure(figsize=(8, 5), dpi=100)
sns.heatmap(X.drop(cols_remove, axis=1).corr(), cmap='Purples', xticklabels=False, yticklabels=False);
From this point all machine learning models will be executed through pipelines.¶
In [63]:
# Function to remove columns based on VIF
def remove_columns_vif(x):
    # NOTE(review): relies on the notebook-global `cols_remove` computed by the
    # VIF cell above — the pipelines below are only valid after that cell has run.
    x = x.drop(cols_remove, axis=1)
    return x

# Wrap the column-dropper so it can be used as a pipeline step.
remove_col_transformer = FunctionTransformer(remove_columns_vif)

def eval_classifier(name, pipeline):
    """Fit `pipeline` on the global train split, score the test split, and print metrics."""
    print(f"{name} --->")
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    positive_scores = pipeline.predict_proba(X_test)[:, 1]
    print(f"\nF1-Score : {f1_score(y_test, predictions):.3f}")
    print(f"AUC - ROC Score : {roc_auc_score(y_test, positive_scores):.3f}", end="\n\n")
    print(classification_report(y_test, predictions))
In [64]:
# Random forests - after feature selection - no tuning.

# Two-step pipeline: drop the high-VIF columns, then fit a 300-tree random forest.
rf_pipeline = Pipeline(
    steps=[
        ("Feature selection", remove_col_transformer),
        ("Random forest classifier", RandomForestClassifier(n_estimators=300, random_state=0))
    ]
)
rf_pipeline
Out[64]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('Random forest classifier',
                 RandomForestClassifier(n_estimators=300, random_state=0))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('Random forest classifier',
                 RandomForestClassifier(n_estimators=300, random_state=0))])
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
RandomForestClassifier(n_estimators=300, random_state=0)
In [65]:
eval_classifier("Random forest pipeline", rf_pipeline)
Random forest pipeline --->

F1-Score : 0.943
AUC - ROC Score : 0.984

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      2195
           1       0.94      0.94      0.94      2205

    accuracy                           0.94      4400
   macro avg       0.94      0.94      0.94      4400
weighted avg       0.94      0.94      0.94      4400

In [66]:
# XGBoost - after feature selection - no tuning.
xg_pipeline = Pipeline(
    steps=[
        ("Feature selection", remove_col_transformer),
        # Step renamed: it was copy-pasted as "Random forest classifier" even
        # though the estimator is an XGBClassifier.
        ("XGBoost classifier", XGBClassifier(eval_metric='auc', random_state=101))
    ]
)
xg_pipeline
Out[66]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('Random forest classifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric='auc',
                               gamma=None, gpu_id...policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=None,
                               max_leaves=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=100, n_jobs=None,
                               num_parallel_tree=None, predictor=None,
                               random_state=101, reg_alpha=None,
                               reg_lambda=None, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('Random forest classifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric='auc',
                               gamma=None, gpu_id...policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=None,
                               max_leaves=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=100, n_jobs=None,
                               num_parallel_tree=None, predictor=None,
                               random_state=101, reg_alpha=None,
                               reg_lambda=None, ...))])
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=101,
              reg_alpha=None, reg_lambda=None, ...)
In [67]:
eval_classifier("XGBoost classifier", xg_pipeline)
XGBoost classifier --->

F1-Score : 0.945
AUC - ROC Score : 0.985

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2195
           1       0.97      0.92      0.94      2205

    accuracy                           0.95      4400
   macro avg       0.95      0.95      0.95      4400
weighted avg       0.95      0.95      0.95      4400

In [68]:
# CatBoost - after feature selection - no tuning: drop high-VIF columns, then fit CatBoost.
cat_pipeline = Pipeline(
    steps=[
        ("Feature selection", remove_col_transformer),
        ("CatBoost classifier", CatBoostClassifier(loss_function='Logloss', verbose=False))
    ]
)
cat_pipeline
Out[68]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('CatBoost classifier',
                 <catboost.core.CatBoostClassifier object at 0x000002F486336D60>)])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('CatBoost classifier',
                 <catboost.core.CatBoostClassifier object at 0x000002F486336D60>)])
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
<catboost.core.CatBoostClassifier object at 0x000002F486336D60>
In [69]:
eval_classifier("CatBoost classifier", cat_pipeline)
CatBoost classifier --->

F1-Score : 0.946
AUC - ROC Score : 0.986

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2195
           1       0.97      0.92      0.95      2205

    accuracy                           0.95      4400
   macro avg       0.95      0.95      0.95      4400
weighted avg       0.95      0.95      0.95      4400

In [70]:
# Soft-voting ensemble (RF + XGBoost + CatBoost) on the VIF-pruned feature set.
# voting="soft" averages predicted probabilities across the three members.
voting_members = [
    ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
    ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
    ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
]
voting_pipeline = Pipeline(steps=[
    ("Feature selection", remove_col_transformer),
    ("Voting classifier", VotingClassifier(estimators=voting_members,
                                           voting="soft",
                                           n_jobs=-1)),
])
voting_pipeline
Out[70]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('Voting classifier',
                 VotingClassifier(estimators=[('random_forest',
                                               RandomForestClassifier(n_estimators=300,
                                                                      random_state=0)),
                                              ('xgb',
                                               XGBClassifier(base_score=None,
                                                             booster=None,
                                                             callbacks=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             colsa...
                                                             max_cat_to_onehot=None,
                                                             max_delta_step=None,
                                                             max_depth=None,
                                                             max_leaves=None,
                                                             min_child_weight=None,
                                                             missing=nan,
                                                             monotone_constraints=None,
                                                             n_estimators=100,
                                                             n_jobs=None,
                                                             num_parallel_tree=None,
                                                             predictor=None,
                                                             random_state=101,
                                                             reg_alpha=None,
                                                             reg_lambda=None, ...)),
                                              ('catboost',
                                               <catboost.core.CatBoostClassifier object at 0x000002F48A6CF370>)],
                                  n_jobs=-1, voting='soft'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('Voting classifier',
                 VotingClassifier(estimators=[('random_forest',
                                               RandomForestClassifier(n_estimators=300,
                                                                      random_state=0)),
                                              ('xgb',
                                               XGBClassifier(base_score=None,
                                                             booster=None,
                                                             callbacks=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             colsa...
                                                             max_cat_to_onehot=None,
                                                             max_delta_step=None,
                                                             max_depth=None,
                                                             max_leaves=None,
                                                             min_child_weight=None,
                                                             missing=nan,
                                                             monotone_constraints=None,
                                                             n_estimators=100,
                                                             n_jobs=None,
                                                             num_parallel_tree=None,
                                                             predictor=None,
                                                             random_state=101,
                                                             reg_alpha=None,
                                                             reg_lambda=None, ...)),
                                              ('catboost',
                                               <catboost.core.CatBoostClassifier object at 0x000002F48A6CF370>)],
                                  n_jobs=-1, voting='soft'))])
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
VotingClassifier(estimators=[('random_forest',
                              RandomForestClassifier(n_estimators=300,
                                                     random_state=0)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric='auc', gamma=None,
                                            gpu_id=None, grow_policy=None,
                                            impor...
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=None,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            predictor=None, random_state=101,
                                            reg_alpha=None, reg_lambda=None, ...)),
                             ('catboost',
                              <catboost.core.CatBoostClassifier object at 0x000002F48A6CF370>)],
                 n_jobs=-1, voting='soft')
RandomForestClassifier(n_estimators=300, random_state=0)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=101,
              reg_alpha=None, reg_lambda=None, ...)
<catboost.core.CatBoostClassifier object at 0x000002F48A6CF370>
In [71]:
eval_classifier("Voting classifier", voting_pipeline)
Voting classifier --->

F1-Score : 0.946
AUC - ROC Score : 0.986

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2195
           1       0.97      0.93      0.95      2205

    accuracy                           0.95      4400
   macro avg       0.95      0.95      0.95      4400
weighted avg       0.95      0.95      0.95      4400

In [74]:
# Stacked ensemble on the VIF-pruned feature set: six base learners whose
# 5-fold out-of-fold predictions feed a CatBoost meta-learner.
# passthrough=False: the meta-learner sees only base-learner outputs.
base_learners = [
    ("LR", LogisticRegression()),
    ("SVM", SVC(probability=True)),
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
    ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
    ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
]
stacking_pipeline = Pipeline(steps=[
    ("Feature selection", remove_col_transformer),
    ("Stacking classifier", StackingClassifier(
        estimators=base_learners,
        final_estimator=CatBoostClassifier(loss_function='Logloss', verbose=False),
        cv=5,
        passthrough=False,
        n_jobs=-1,
    )),
])
stacking_pipeline
Out[74]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('Stacking classifier',
                 StackingClassifier(cv=5,
                                    estimators=[('LR', LogisticRegression()),
                                                ('SVM', SVC(probability=True)),
                                                ('KNN', KNeighborsClassifier()),
                                                ('random_forest',
                                                 RandomForestClassifier(n_estimators=300,
                                                                        random_state=0)),
                                                ('xgb',
                                                 XGBClassif...
                                                               max_leaves=None,
                                                               min_child_weight=None,
                                                               missing=nan,
                                                               monotone_constraints=None,
                                                               n_estimators=100,
                                                               n_jobs=None,
                                                               num_parallel_tree=None,
                                                               predictor=None,
                                                               random_state=101,
                                                               reg_alpha=None,
                                                               reg_lambda=None, ...)),
                                                ('catboost',
                                                 <catboost.core.CatBoostClassifier object at 0x000002F48820E760>)],
                                    final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48820E610>,
                                    n_jobs=-1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)),
                ('Stacking classifier',
                 StackingClassifier(cv=5,
                                    estimators=[('LR', LogisticRegression()),
                                                ('SVM', SVC(probability=True)),
                                                ('KNN', KNeighborsClassifier()),
                                                ('random_forest',
                                                 RandomForestClassifier(n_estimators=300,
                                                                        random_state=0)),
                                                ('xgb',
                                                 XGBClassif...
                                                               max_leaves=None,
                                                               min_child_weight=None,
                                                               missing=nan,
                                                               monotone_constraints=None,
                                                               n_estimators=100,
                                                               n_jobs=None,
                                                               num_parallel_tree=None,
                                                               predictor=None,
                                                               random_state=101,
                                                               reg_alpha=None,
                                                               reg_lambda=None, ...)),
                                                ('catboost',
                                                 <catboost.core.CatBoostClassifier object at 0x000002F48820E760>)],
                                    final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48820E610>,
                                    n_jobs=-1))])
FunctionTransformer(func=<function remove_columns_vif at 0x000002F4759101F0>)
StackingClassifier(cv=5,
                   estimators=[('LR', LogisticRegression()),
                               ('SVM', SVC(probability=True)),
                               ('KNN', KNeighborsClassifier()),
                               ('random_forest',
                                RandomForestClassifier(n_estimators=300,
                                                       random_state=0)),
                               ('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              early_stopping_roun...
                                              max_depth=None, max_leaves=None,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                                              n_estimators=100, n_jobs=None,
                                              num_parallel_tree=None,
                                              predictor=None, random_state=101,
                                              reg_alpha=None, reg_lambda=None, ...)),
                               ('catboost',
                                <catboost.core.CatBoostClassifier object at 0x000002F48820E760>)],
                   final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48820E610>,
                   n_jobs=-1)
LogisticRegression()
SVC(probability=True)
KNeighborsClassifier()
RandomForestClassifier(n_estimators=300, random_state=0)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=101,
              reg_alpha=None, reg_lambda=None, ...)
<catboost.core.CatBoostClassifier object at 0x000002F48820E760>
<catboost.core.CatBoostClassifier object at 0x000002F48820E610>
In [75]:
eval_classifier("Stacking classifier", stacking_pipeline)
Stacking classifier --->

F1-Score : 0.942
AUC - ROC Score : 0.986

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      2195
           1       0.95      0.94      0.94      2205

    accuracy                           0.94      4400
   macro avg       0.94      0.94      0.94      4400
weighted avg       0.94      0.94      0.94      4400

2. Feature selection - Mutual Information Score¶

In [8]:
# Mutual information between each permission flag and the target.
# Fix: mutual_info_classif uses a nearest-neighbour estimator with random
# noise/tie-breaking, so without a fixed random_state the low-MI column set
# (and every downstream model) changes between runs. Pin it for reproducibility.
mi_score = MIC(X, y, random_state=0)
In [9]:
# Columns whose mutual information with the target is effectively zero
# (< 0.001) — these carry no signal and are candidates for removal.
low_mi_mask = mi_score < 0.001
mi_cols_remove = X.columns[low_mi_mask]
mi_cols_remove
Out[9]:
Index(['android.permission.WRITE_SYNC_SETTINGS',
       'android.permission.AUTHENTICATE_ACCOUNTS',
       'android.permission.FLASHLIGHT',
       'android.permission.READ_SYNC_SETTINGS',
       'android.permission.BROADCAST_STICKY',
       'android.permission.SET_WALLPAPER_HINTS',
       'android.permission.ACCESS_MOCK_LOCATION',
       'com.google.android.gms.permission.ACTIVITY_RECOGNITION',
       'com.android.vending.CHECK_LICENSE', 'android.permission.VIBRATE',
       'android.permission.RECEIVE_USER_PRESENT'],
      dtype='object')
In [113]:
def MI_remover(x):
    """Drop the low-mutual-information columns from a feature frame.

    Relies on the module-level ``mi_cols_remove`` Index computed above;
    returns a new DataFrame, leaving ``x`` untouched.
    """
    return x.drop(columns=mi_cols_remove)

# Wrap as a stateless sklearn transformer so it can sit in a Pipeline.
mi_remove_col_transformer = FunctionTransformer(MI_remover)
In [114]:
# Random forest on the MI-filtered feature set.
rf_steps = [
    ("Feature selection", mi_remove_col_transformer),
    ("Random forest classifier", RandomForestClassifier(n_estimators=300, random_state=0)),
]
rf_pipeline = Pipeline(steps=rf_steps)
rf_pipeline
Out[114]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)),
                ('Random forest classifier',
                 RandomForestClassifier(n_estimators=300, random_state=0))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)),
                ('Random forest classifier',
                 RandomForestClassifier(n_estimators=300, random_state=0))])
FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)
RandomForestClassifier(n_estimators=300, random_state=0)
In [115]:
eval_classifier("Random forest pipeline", rf_pipeline)
Random forest pipeline --->

F1-Score : 0.965
AUC - ROC Score : 0.991

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2195
           1       0.97      0.96      0.96      2205

    accuracy                           0.96      4400
   macro avg       0.96      0.96      0.96      4400
weighted avg       0.96      0.96      0.96      4400

In [116]:
# XGBoost on the MI-filtered feature set.
# Fix: the second step was mislabelled "Random forest classifier" (copy-paste
# from the rf_pipeline cell) although the estimator is an XGBClassifier.
# Rename it so the pipeline repr and get_params keys are not misleading.
xg_pipeline = Pipeline(
    steps=[
        ("Feature selection", mi_remove_col_transformer),
        ("XGBoost classifier", XGBClassifier(eval_metric='auc', random_state=101))
    ]
)
xg_pipeline
Out[116]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)),
                ('Random forest classifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric='auc',
                               gamma=None, gpu_id=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=None,
                               max_leaves=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=100, n_jobs=None,
                               num_parallel_tree=None, predictor=None,
                               random_state=101, reg_alpha=None,
                               reg_lambda=None, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)),
                ('Random forest classifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric='auc',
                               gamma=None, gpu_id=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_to_onehot=None,
                               max_delta_step=None, max_depth=None,
                               max_leaves=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=100, n_jobs=None,
                               num_parallel_tree=None, predictor=None,
                               random_state=101, reg_alpha=None,
                               reg_lambda=None, ...))])
FunctionTransformer(func=<function MI_remover at 0x000002F487434D30>)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=101,
              reg_alpha=None, reg_lambda=None, ...)
In [117]:
eval_classifier("XGBoost classifier", xg_pipeline)
XGBoost classifier --->

F1-Score : 0.963
AUC - ROC Score : 0.991

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2195
           1       0.97      0.96      0.96      2205

    accuracy                           0.96      4400
   macro avg       0.96      0.96      0.96      4400
weighted avg       0.96      0.96      0.96      4400

In [97]:
# CatBoost on the MI-filtered feature set.
mi_cat_steps = [
    ("Feature selection", mi_remove_col_transformer),
    ("CatBoost classifier", CatBoostClassifier(loss_function='Logloss', verbose=False)),
]
cat_pipeline = Pipeline(steps=mi_cat_steps)
cat_pipeline
Out[97]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
                ('CatBoost classifier',
                 <catboost.core.CatBoostClassifier object at 0x000002F4874429A0>)])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
                ('CatBoost classifier',
                 <catboost.core.CatBoostClassifier object at 0x000002F4874429A0>)])
FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)
<catboost.core.CatBoostClassifier object at 0x000002F4874429A0>
In [98]:
eval_classifier("CatBoost classifier", cat_pipeline)
CatBoost classifier --->

F1-Score : 0.963
AUC - ROC Score : 0.993

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2195
           1       0.96      0.97      0.96      2205

    accuracy                           0.96      4400
   macro avg       0.96      0.96      0.96      4400
weighted avg       0.96      0.96      0.96      4400

In [99]:
# Soft-voting ensemble (RF + XGBoost + CatBoost) on the MI-filtered feature set.
# voting="soft" averages predicted probabilities across the three members.
ensemble_members = [
    ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
    ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
    ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
]
voting_pipeline = Pipeline(steps=[
    ("Feature selection", mi_remove_col_transformer),
    ("Voting classifier", VotingClassifier(estimators=ensemble_members,
                                           voting="soft",
                                           n_jobs=-1)),
])
voting_pipeline
Out[99]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
                ('Voting classifier',
                 VotingClassifier(estimators=[('random_forest',
                                               RandomForestClassifier(n_estimators=300,
                                                                      random_state=0)),
                                              ('xgb',
                                               XGBClassifier(base_score=None,
                                                             booster=None,
                                                             callbacks=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             colsample_byt...
                                                             max_cat_to_onehot=None,
                                                             max_delta_step=None,
                                                             max_depth=None,
                                                             max_leaves=None,
                                                             min_child_weight=None,
                                                             missing=nan,
                                                             monotone_constraints=None,
                                                             n_estimators=100,
                                                             n_jobs=None,
                                                             num_parallel_tree=None,
                                                             predictor=None,
                                                             random_state=101,
                                                             reg_alpha=None,
                                                             reg_lambda=None, ...)),
                                              ('catboost',
                                               <catboost.core.CatBoostClassifier object at 0x000002F487442400>)],
                                  n_jobs=-1, voting='soft'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
                ('Voting classifier',
                 VotingClassifier(estimators=[('random_forest',
                                               RandomForestClassifier(n_estimators=300,
                                                                      random_state=0)),
                                              ('xgb',
                                               XGBClassifier(base_score=None,
                                                             booster=None,
                                                             callbacks=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             colsample_byt...
                                                             max_cat_to_onehot=None,
                                                             max_delta_step=None,
                                                             max_depth=None,
                                                             max_leaves=None,
                                                             min_child_weight=None,
                                                             missing=nan,
                                                             monotone_constraints=None,
                                                             n_estimators=100,
                                                             n_jobs=None,
                                                             num_parallel_tree=None,
                                                             predictor=None,
                                                             random_state=101,
                                                             reg_alpha=None,
                                                             reg_lambda=None, ...)),
                                              ('catboost',
                                               <catboost.core.CatBoostClassifier object at 0x000002F487442400>)],
                                  n_jobs=-1, voting='soft'))])
FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)
VotingClassifier(estimators=[('random_forest',
                              RandomForestClassifier(n_estimators=300,
                                                     random_state=0)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric='auc', gamma=None,
                                            gpu_id=None, grow_policy=None,
                                            impor...
                                            max_cat_to_onehot=None,
                                            max_delta_step=None, max_depth=None,
                                            max_leaves=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            predictor=None, random_state=101,
                                            reg_alpha=None, reg_lambda=None, ...)),
                             ('catboost',
                              <catboost.core.CatBoostClassifier object at 0x000002F487442400>)],
                 n_jobs=-1, voting='soft')
RandomForestClassifier(n_estimators=300, random_state=0)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=101,
              reg_alpha=None, reg_lambda=None, ...)
<catboost.core.CatBoostClassifier object at 0x000002F487442400>
In [100]:
# Evaluate the fitted soft-voting ensemble on the held-out test set;
# eval_classifier (defined earlier) prints F1, ROC-AUC and a classification report.
eval_classifier("Voting classifier", voting_pipeline)
Voting classifier --->

F1-Score : 0.964
AUC - ROC Score : 0.993

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2195
           1       0.97      0.96      0.96      2205

    accuracy                           0.96      4400
   macro avg       0.96      0.96      0.96      4400
weighted avg       0.96      0.96      0.96      4400

In [103]:
# Stacking ensemble: six heterogeneous base learners produce out-of-fold
# predictions (cv=5) that a CatBoost meta-learner combines.
base_learners = [
    ("LR", LogisticRegression()),
    ("SVM", SVC(probability=True)),  # probability=True so predict_proba is available for stacking
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("random_forest", RandomForestClassifier(n_estimators=300, random_state=0)),
    ("xgb", XGBClassifier(eval_metric='auc', random_state=101)),
    ("catboost", CatBoostClassifier(loss_function='Logloss', verbose=False)),
]

stacking_pipeline = Pipeline(
    steps=[
        ("Feature selection", mi_remove_col_transformer),
        ("Stacking classifier", StackingClassifier(
            estimators=base_learners,
            final_estimator=CatBoostClassifier(loss_function='Logloss', verbose=False),
            cv=5,                # 5-fold out-of-fold predictions feed the meta-learner
            passthrough=False,   # meta-learner sees only base predictions, not raw features
            n_jobs=-1,
        )),
    ]
)
stacking_pipeline
Out[103]:
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
                ('Stacking classifier',
                 StackingClassifier(cv=5,
                                    estimators=[('LR', LogisticRegression()),
                                                ('SVM', SVC(probability=True)),
                                                ('KNN', KNeighborsClassifier()),
                                                ('random_forest',
                                                 RandomForestClassifier(n_estimators=300,
                                                                        random_state=0)),
                                                ('xgb',
                                                 XGBClassifier(base...
                                                               max_leaves=None,
                                                               min_child_weight=None,
                                                               missing=nan,
                                                               monotone_constraints=None,
                                                               n_estimators=100,
                                                               n_jobs=None,
                                                               num_parallel_tree=None,
                                                               predictor=None,
                                                               random_state=101,
                                                               reg_alpha=None,
                                                               reg_lambda=None, ...)),
                                                ('catboost',
                                                 <catboost.core.CatBoostClassifier object at 0x000002F48738A970>)],
                                    final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48738AA30>,
                                    n_jobs=-1))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('Feature selection',
                 FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)),
                ('Stacking classifier',
                 StackingClassifier(cv=5,
                                    estimators=[('LR', LogisticRegression()),
                                                ('SVM', SVC(probability=True)),
                                                ('KNN', KNeighborsClassifier()),
                                                ('random_forest',
                                                 RandomForestClassifier(n_estimators=300,
                                                                        random_state=0)),
                                                ('xgb',
                                                 XGBClassifier(base...
                                                               max_leaves=None,
                                                               min_child_weight=None,
                                                               missing=nan,
                                                               monotone_constraints=None,
                                                               n_estimators=100,
                                                               n_jobs=None,
                                                               num_parallel_tree=None,
                                                               predictor=None,
                                                               random_state=101,
                                                               reg_alpha=None,
                                                               reg_lambda=None, ...)),
                                                ('catboost',
                                                 <catboost.core.CatBoostClassifier object at 0x000002F48738A970>)],
                                    final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48738AA30>,
                                    n_jobs=-1))])
FunctionTransformer(func=<function MI_remover at 0x000002F48633AE50>)
StackingClassifier(cv=5,
                   estimators=[('LR', LogisticRegression()),
                               ('SVM', SVC(probability=True)),
                               ('KNN', KNeighborsClassifier()),
                               ('random_forest',
                                RandomForestClassifier(n_estimators=300,
                                                       random_state=0)),
                               ('xgb',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              early_stopping_roun...
                                              max_depth=None, max_leaves=None,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                                              n_estimators=100, n_jobs=None,
                                              num_parallel_tree=None,
                                              predictor=None, random_state=101,
                                              reg_alpha=None, reg_lambda=None, ...)),
                               ('catboost',
                                <catboost.core.CatBoostClassifier object at 0x000002F48738A970>)],
                   final_estimator=<catboost.core.CatBoostClassifier object at 0x000002F48738AA30>,
                   n_jobs=-1)
LogisticRegression()
SVC(probability=True)
KNeighborsClassifier()
RandomForestClassifier(n_estimators=300, random_state=0)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=101,
              reg_alpha=None, reg_lambda=None, ...)
<catboost.core.CatBoostClassifier object at 0x000002F48738A970>
<catboost.core.CatBoostClassifier object at 0x000002F48738AA30>
In [104]:
# Evaluate the stacking ensemble on the same held-out test set for a
# direct comparison with the voting classifier above.
eval_classifier("Stacking classifier", stacking_pipeline)
Stacking classifier --->

F1-Score : 0.965
AUC - ROC Score : 0.993

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2195
           1       0.97      0.96      0.96      2205

    accuracy                           0.96      4400
   macro avg       0.96      0.96      0.96      4400
weighted avg       0.96      0.96      0.96      4400

Bayesian based hyper-parameter tuning for Random Forests¶

In [10]:
# Drop the columns flagged earlier (mi_cols_remove — presumably from the
# mutual-information filter; defined in an earlier cell) so BayesSearchCV
# can run on the reduced feature set without the pipeline wrapper.
X_train_bayes = X_train.drop(mi_cols_remove, axis=1)
X_test_bayes = X_test.drop(mi_cols_remove, axis=1)
In [12]:
# Bayesian hyper-parameter search for the random forest.
# Each tuple below is interpreted by skopt as a categorical dimension.
rf_search_space = {
    'n_estimators': (100, 200, 300, 400, 500, 1000),
    'criterion': ("gini", "entropy", "log_loss"),
    'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None),
    'min_samples_split': (2, 5, 10),
    'min_samples_leaf': (1, 2, 4),
    'max_features': ("sqrt", "log2"),
    'bootstrap': (True, False),
}

bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=0, n_jobs=-1),
    search_spaces=rf_search_space,
    cv=3,            # 3-fold CV per candidate
    n_jobs=-1,
    n_points=5,      # evaluate 5 candidate settings in parallel per iteration
    random_state=121,
)
bayes_search
Out[12]:
BayesSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
              n_jobs=-1, n_points=5, random_state=121,
              search_spaces={'bootstrap': (True, False),
                             'criterion': ('gini', 'entropy', 'log_loss'),
                             'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90,
                                           100, None),
                             'max_features': ('sqrt', 'log2'),
                             'min_samples_leaf': (1, 2, 4),
                             'min_samples_split': (2, 5, 10),
                             'n_estimators': (100, 200, 300, 400, 500, 1000)})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
BayesSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
              n_jobs=-1, n_points=5, random_state=121,
              search_spaces={'bootstrap': (True, False),
                             'criterion': ('gini', 'entropy', 'log_loss'),
                             'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90,
                                           100, None),
                             'max_features': ('sqrt', 'log2'),
                             'min_samples_leaf': (1, 2, 4),
                             'min_samples_split': (2, 5, 10),
                             'n_estimators': (100, 200, 300, 400, 500, 1000)})
RandomForestClassifier(n_jobs=-1, random_state=0)
RandomForestClassifier(n_jobs=-1, random_state=0)
In [124]:
# Run the Bayesian optimisation on the MI-filtered training data
# (this is the expensive step of the tuning workflow).
bayes_search.fit(X_train_bayes, y_train)
Out[124]:
BayesSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
              n_jobs=-1, n_points=5, random_state=121,
              search_spaces={'bootstrap': (True, False),
                             'criterion': ('gini', 'entropy', 'log_loss'),
                             'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90,
                                           100, None),
                             'max_features': ('sqrt', 'log2'),
                             'min_samples_leaf': (1, 2, 4),
                             'min_samples_split': (2, 5, 10),
                             'n_estimators': (100, 200, 300, 400, 500, 1000)})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
BayesSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
              n_jobs=-1, n_points=5, random_state=121,
              search_spaces={'bootstrap': (True, False),
                             'criterion': ('gini', 'entropy', 'log_loss'),
                             'max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90,
                                           100, None),
                             'max_features': ('sqrt', 'log2'),
                             'min_samples_leaf': (1, 2, 4),
                             'min_samples_split': (2, 5, 10),
                             'n_estimators': (100, 200, 300, 400, 500, 1000)})
RandomForestClassifier(n_jobs=-1, random_state=0)
RandomForestClassifier(n_jobs=-1, random_state=0)
In [125]:
# Best hyper-parameter combination found by the search (used in the next cell).
bayes_search.best_params_
Out[125]:
OrderedDict([('bootstrap', False),
             ('criterion', 'gini'),
             ('max_depth', 80),
             ('max_features', 'log2'),
             ('min_samples_leaf', 1),
             ('min_samples_split', 5),
             ('n_estimators', 400)])
In [11]:
# Rebuild the random forest with the best hyper-parameters reported by
# bayes_search.best_params_ above (random_state fixed for reproducibility).
best_rf_params = dict(
    n_estimators=400,
    criterion='gini',
    max_depth=80,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=5,
    bootstrap=False,
)
rf_classifier = RandomForestClassifier(random_state=0, **best_rf_params)
In [12]:
# Fit the tuned forest and report held-out test metrics.
rf_classifier.fit(X_train_bayes, y_train)
y_pred = rf_classifier.predict(X_test_bayes)
y_proba = rf_classifier.predict_proba(X_test_bayes)[:, 1]  # P(class = 1) for ROC-AUC
print(f"\nF1-Score : {f1_score(y_test, y_pred):.3f}")
print(f"AUC - ROC Score : {roc_auc_score(y_test, y_proba):.3f}", end="\n\n")
print(classification_report(y_test, y_pred))
F1-Score : 0.966
AUC - ROC Score : 0.991

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      2195
           1       0.97      0.96      0.97      2205

    accuracy                           0.97      4400
   macro avg       0.97      0.97      0.97      4400
weighted avg       0.97      0.97      0.97      4400

In [13]:
# Confusion matrix of the tuned forest on the test set, drawn as a heatmap.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
sns.heatmap(cm, annot=True, fmt=".5g", cmap='Blues', ax=ax);

ROC curve¶

In [22]:
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is the supported replacement and draws
# the same ROC curve for a fitted estimator. (This also avoids the empty
# stray figure the old call produced here.)
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(rf_classifier, X_test_bayes, y_test);
<Figure size 1000x400 with 0 Axes>

ML explainability using Shap values¶

Red indicates features pushing the model's prediction higher.¶
Blue indicates features pushing the model's prediction lower.¶
In [14]:
# Build a model-agnostic SHAP explainer over a small slice of the test set.
# Note: pandas .loc slicing is end-INCLUSIVE, so 0:300 keeps 301 rows.
# Explaining rf_classifier.predict (hard labels) makes shap fall back to
# the Permutation explainer (see progress bar below), which is slow —
# hence the small sample. (xxx is reused by the plotting cells below,
# so the name is kept as-is.)
xxx = X_test_bayes.reset_index(drop=True).loc[0:300, :]
explainer = shap.Explainer(rf_classifier.predict, xxx)
shap_values = explainer(xxx)
Permutation explainer: 302it [02:25,  1.96it/s]                                                                        
Analyzing the contribution of each feature for the 1st sample in the test set.¶
In [15]:
# Waterfall plot: per-feature contributions for the first explained sample;
# max_display = number of columns so every feature is shown.
shap.plots.waterfall(shap_values[0], max_display=xxx.shape[1])
Force plot for 1st sample¶
In [69]:
# initjs() injects the JavaScript needed for interactive force plots;
# the visualization only renders in a trusted, live notebook session.
shap.initjs()
shap.plots.force(shap_values[0])
Out[69]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Force plot for the first 301 samples (rows 0–300; `.loc` slicing is end-inclusive)¶
In [71]:
# Stacked force plot across all explained samples (interactive; requires
# a trusted notebook — it will not render on GitHub/nbviewer).
shap.initjs()
shap.plots.force(shap_values)
Out[71]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [66]:
# Beeswarm: distribution of SHAP values per feature across the explained samples.
shap.plots.beeswarm(shap_values, max_display=xxx.shape[1])
In [67]:
# Bar chart of mean |SHAP value| per feature — a global importance ranking.
shap.plots.bar(shap_values, max_display=xxx.shape[1])
In [68]:
# Classic summary plot of SHAP values against the underlying feature values.
shap.summary_plot(shap_values, xxx, max_display=xxx.shape[1])
In [ ]: